import sys

import jieba
from pyecharts.charts import WordCloud
from pyecharts import options as opts
from collections import Counter

# Read the novel text and merge all lines into one long string.
# (The original comment mentioned "room-type descriptions" — a copy-paste
# leftover from another script; this file processes the novel text.)
with open("《三国演义》原版.txt", "r", encoding="utf-8") as f:
    text = " ".join(line.strip() for line in f)

# Segment the Chinese text into words with jieba (precise mode).
words = jieba.lcut(text)

# Load the stop-word list (common function words such as "的", "是")
# and filter the token stream.
with open("cn_stopwords.txt", "r", encoding="utf-8") as f:
    all_stop_words = [line.strip("\n") for line in f]

# Membership tests run once per token, so use a set for O(1) lookup
# instead of scanning the list for every word.
_stop_word_set = set(all_stop_words)

# BUGFIX: the original condition was `len(word) > 3`, which throws away
# almost every meaningful Chinese word — names like 曹操 (2 chars) and
# 诸葛亮 (3 chars) would all be dropped. `len(word) > 1` keeps multi-char
# words while discarding single characters and punctuation.
filtered_words = [
    word for word in words if word not in _stop_word_set and len(word) > 1
]

# Count word frequencies and keep the 300 most common words.
# (The original comment said "top 150" while the code took 300; the code's
# value of 300 is kept to preserve behavior.)
word_counts = Counter(filtered_words)
top_words = word_counts.most_common(300)

# Build the word-cloud chart step by step and write it out as an HTML file.
wordcloud = WordCloud(init_opts=opts.InitOpts(width="1200px", height="600px"))
wordcloud.add(series_name="三国关键词", data_pair=top_words)
wordcloud.set_global_opts(title_opts=opts.TitleOpts(title="三国演义关键词"))

# Save the rendered chart.
wordcloud.render("三国演义关键词.html")
